☑ A quick guide to Data Cleaning

In [174]:
import warnings
warnings.filterwarnings('ignore')

libraries¶

In [133]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder 

datasets¶

In [175]:
specifications=pd.read_csv("./datasets/specifications.csv")
annual_sales=pd.read_csv("./datasets/annual_sales.csv")
df=pd.concat([specifications,annual_sales], axis=1, join="inner")

exploratory data analysis¶

removing strings from digits¶

In [176]:
for i in ['Minimum_Turning_Radius','Ex-Showroom_Price','Displacement',
         'Fuel_Tank_Capacity','Height','Width','Length','Front_Track','Rear_Track',
         'City_Mileage','Highway_Mileage','ARAI_Certified_Mileage','ARAI_Certified_Mileage_for_CNG',
         'Kerb_Weight','Ground_Clearance','Wheelbase','Boot_Space']:
    df[i]=df[i].str.replace(r'\D','')
    df[i]=pd.to_numeric(df[i])
for i in range(len(df.Basic_Warranty)):
    if type(df.Basic_Warranty[i])==str:df.Basic_Warranty[i]=int(df.Basic_Warranty[i][0])
df.tail(3)    
Out[176]:
Make Model Variant Ex-Showroom_Price Displacement Cylinders Valves_Per_Cylinder Drivetrain Cylinder_Configuration Emission_Norm ... 2010 2009 2008 2007 2006 2005 2004 2003 2002 2001
770 Ford Endeavour 2.2L 4X2 Mt Titanium 2920000 2198.0 4.0 4.0 RWD (Rear Wheel Drive) In-line BS IV ... 1447 1067 1059 1423 1347 1524 963 626 119 0
771 Mini Cooper 3 Door Cooper D 2990000 1496.0 3.0 4.0 FWD (Front Wheel Drive) In-line BS IV ... 6392 3633 1879 3232 3547 4614 3901 1507 0 0
772 Mini Cooper 3 Door Cooper S 3420000 1998.0 4.0 4.0 FWD (Front Wheel Drive) In-line BS IV ... 0 0 0 0 0 0 0 0 0 0

3 rows × 160 columns

drop unnecessary columnns¶

In [177]:
df.drop(['Drivetrain',
 'Cylinder_Configuration',
 'Emission_Norm',
 'Engine_Location','Fuel_System','ARAI_Certified_Mileage',
 'ARAI_Certified_Mileage_for_CNG','Ground_Clearance',
 'Front_Brakes',
 'Rear_Brakes',
 'Front_Suspension',
 'Rear_Suspension',
 'Front_Track',
 'Rear_Track',
 'Front_Tyre_&_Rim',
 'Rear_Tyre_&_Rim',
 'Power_Steering',
 'Power_Windows',
 'Power_Seats',
 'Keyless_Entry','Torque',
 'Odometer','Tachometer',
 'Tripmeter','Start_/_Stop_Button',
 '12v_Power_Outlet','Aux-in_Compatibility',
 'Average_Fuel_Consumption','Boot-lid_Opener',
 'Boot_Space','Central_Locking',
 'Child_Safety_Locks',
 'Clock',
 'Cup_Holders',
 'Distance_to_Empty',
 'Door_Pockets',
 'Engine_Malfunction_Light',
 'Extended_Warranty',
 'FM_Radio',
 'Fuel-lid_Opener',
 'Fuel_Gauge',
 'Handbrake',
 'Instrument_Console',
 'Low_Fuel_Warning',
 'Minimum_Turning_Radius',
 'Multifunction_Display',
 'Sun_Visor',
 'Third_Row_AC_Vents',
 'Ventilation_System',
 'Auto-Dimming_Rear-View_Mirror',
 'Hill_Assist',
 'Gear_Indicator',
 '3_Point_Seat-Belt_in_Middle_Rear_Seat',
 'Ambient_Lightning',
 'Cargo/Boot_Lights',
 'Drive_Modes',
 'Engine_Immobilizer',
 'High_Speed_Alert_System',
 'Lane_Watch_Camera/_Side_Mirror_Camera',
 'Passenger_Side_Seat-Belt_Reminder',
 'Seat_Back_Pockets','Headlight_Reminder',
 'Adjustable_Headrests',
 'Gross_Vehicle_Weight','Door_Ajar_Warning',
 'EBD_(Electronic_Brake-force_Distribution)',
 'Fasten_Seat_Belt_Warning',
 'Gear_Shift_Reminder',
 'Number_of_Airbags',
 'Compression_Ratio',
 'Adjustable_Steering_Column',
 'Other_Specs',
 'Other_specs','Key_Off_Reminder',
 'USB_Compatibility',
 'Android_Auto',
 'Apple_CarPlay',
 'Cigarette_Lighter',
 'Infotainment_Screen',
 'Multifunction_Steering_Wheel',
 'Average_Speed',
 'EBA_(Electronic_Brake_Assist)',
 'Seat_Height_Adjustment',
 'Navigation_System',
 'Second_Row_AC_Vents',
 'Tyre_Pressure_Monitoring_System',
 'Rear_Center_Armrest',
 'iPod_Compatibility',
 'ESP_(Electronic_Stability_Program)',
 'Cooled_Glove_Box',
 'Recommended_Tyre_Pressure',
 'Heated_Seats',
 'Turbocharger',
 'ISOFIX_(Child-Seat_Mount)',
 'Rain_Sensing_Wipers',
 'Paddle_Shifters',
 'Leather_Wrapped_Steering',
 'Automatic_Headlamps',
 'Engine_Type',
 'ASR_/_Traction_Control',
 'Cruise_Control',
 'USB_Ports',
 'Heads-Up_Display',
 'Welcome_Lights',
 'Battery',
 'Electric_Range'], axis=1,inplace=True)
In [178]:
df.columns
Out[178]:
Index(['Make', 'Model', 'Variant', 'Ex-Showroom_Price', 'Displacement',
       'Cylinders', 'Valves_Per_Cylinder', 'Fuel_Tank_Capacity', 'Fuel_Type',
       'Height', 'Length', 'Width', 'Body_Type', 'Doors', 'City_Mileage',
       'Highway_Mileage', 'Kerb_Weight', 'Gears', 'Power', 'Speedometer',
       'Seating_Capacity', 'Seats_Material', 'Type', 'Wheelbase',
       'Wheels_Size', 'Audiosystem', 'Basic_Warranty', 'Bluetooth',
       'CD_/_MP3_/_DVD_Player', 'Voice_Recognition', 'Walk_Away_Auto_Car_Lock',
       'ABS_(Anti-lock_Braking_System)', 'Airbags', 'Parking_Assistance',
       '2020', '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012',
       '2011', '2010', '2009', '2008', '2007', '2006', '2005', '2004', '2003',
       '2002', '2001'],
      dtype='object')

merging city and highway mileage by taking avg¶

In [179]:
df["Mileage"]=(df.City_Mileage+df.Highway_Mileage)/2
df.drop(['City_Mileage',
       'Highway_Mileage'],axis=1,inplace=True)
column = df.pop('Mileage')
df.insert(10, 'Mileage', column)
df.columns
Out[179]:
Index(['Make', 'Model', 'Variant', 'Ex-Showroom_Price', 'Displacement',
       'Cylinders', 'Valves_Per_Cylinder', 'Fuel_Tank_Capacity', 'Fuel_Type',
       'Height', 'Mileage', 'Length', 'Width', 'Body_Type', 'Doors',
       'Kerb_Weight', 'Gears', 'Power', 'Speedometer', 'Seating_Capacity',
       'Seats_Material', 'Type', 'Wheelbase', 'Wheels_Size', 'Audiosystem',
       'Basic_Warranty', 'Bluetooth', 'CD_/_MP3_/_DVD_Player',
       'Voice_Recognition', 'Walk_Away_Auto_Car_Lock',
       'ABS_(Anti-lock_Braking_System)', 'Airbags', 'Parking_Assistance',
       '2020', '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012',
       '2011', '2010', '2009', '2008', '2007', '2006', '2005', '2004', '2003',
       '2002', '2001'],
      dtype='object')

Parking assistance encoding¶

In [180]:
for i in range(len(df.Parking_Assistance)):
    if type(df.Parking_Assistance[i])!=str:
        df.Parking_Assistance[i]=0
    else:
        df.Parking_Assistance[i]=1
In [181]:
df.Parking_Assistance
Out[181]:
0      0
1      0
2      0
3      0
4      0
      ..
768    1
769    1
770    1
771    1
772    1
Name: Parking_Assistance, Length: 773, dtype: object

converting power into integer from string and removing units¶

In [182]:
for i in range(len(df.Power)):
        df.Power[i]=int(''.join(filter(str.isdigit, df.Power[i].split("@")[0])))

data profiling¶

categorical data encoding¶

In [183]:
#using oneHotEncoding
def ohe(c93):
    merged=copy.deepcopy(c93)
    for i in c93.columns:
        if c93[i].dtypes==object:
            dummy=pd.get_dummies(c93[i],drop_first=True)
            merged=pd.concat([merged,dummy],axis='columns')
            merged.drop(i,axis=1,inplace=True)
    return merged        

#-------------------------------------------------------------
def le1(c93):
    lbcode = LabelEncoder()
    merged=copy.deepcopy(c93)

    for i in c93.columns:
        if c93[i].dtypes==object:
            merged[i] = lbcode.fit_transform(merged[i])
    return merged

#--------------------------------------------------------------
def le2(c93):
    orcode = OrdinalEncoder()
    
    for i in c93.columns:
        if c93[i].dtypes==object:
            c93_encode = orcode.fit_transform(c93[[i]])
            print(i,c93_encode,'\n')
#--------------------------------------------------------------    
def le3(c93):
    ohe=OneHotEncoder(sparse=False)
    for i in c93.columns:
        if c93[i].dtypes==object:
            arr = ohe.fit_transform(c93[[i]])
            print(i,arr,'\n')            

label encoding all the categorical data¶

In [184]:
le=LabelEncoder()
for i in df.columns:
        if df[i].dtypes==object:
            df[i]=le.fit_transform(df[[i]])

handling missing values¶

drop rows¶
simple inputer¶
logistic regresssion¶
knn imputer¶
In [185]:
from sklearn.impute import KNNImputer
imp = KNNImputer(n_neighbors=5)
for i in df.columns:
    if df[i].dtype!=object:df[i]=imp.fit_transform(np.array(df[i]).reshape(-1,1))

handling outliers¶

In [173]:
# Scatterplot Matrix
from pandas.plotting import scatter_matrix
scatter_matrix(df,figsize=(50,50))
plt.show()

iqr¶

In [269]:
#choose multiplier
#drop rows with outliers
In [172]:
"""
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
multiplier=10
df = df[~((df < (Q1 - multiplier * IQR)) |(df > (Q3 + multiplier * IQR))).any(axis=1)]
df.shape
"""

# this can also be done for a specific column instead of the entire dataframe.
# the columns can be chosen based on the above scatterplots
Out[172]:
'\nQ1 = df.quantile(0.25)\nQ3 = df.quantile(0.75)\nIQR = Q3 - Q1\nmultiplier=10\ndf = df[~((df < (Q1 - multiplier * IQR)) |(df > (Q3 + multiplier * IQR))).any(axis=1)]\ndf.shape\n'
In [171]:
"""
z=stats.zscore(df.Power)
df.Power[(z < 3)]
# drop those rows...
"""
Out[171]:
'\nz=stats.zscore(df.Power)\ndf.Power[(z < 3)]\n# drop those rows...\n'

Standard scaling, feature reduction using pca, normalization and data smoothening is not recommended since it may cause loss of important information such as the price of each car and its sales

correlation matrices & heatmaps¶

Generally, correlation matrices must be formed after normalizing/standard scaling the data, but that can lead to loss of important information, especially for the automotive industry. Hence, a copy of the dataset caan be created and the correlations can be formed by scaling or normalizing that copy

In [191]:
import copy
copy=copy.deepcopy(df)

data smoothening¶

In [192]:
alpha=0.3
copy=copy.ewm(alpha=alpha).mean()
copy
Out[192]:
Make Model Variant Ex-Showroom_Price Displacement Cylinders Valves_Per_Cylinder Fuel_Tank_Capacity Fuel_Type Height ... 2010 2009 2008 2007 2006 2005 2004 2003 2002 2001
0 32.000000 91.000000 592.000000 2.926670e+05 624.000000 2.000000 2.000000 24.000000 5.000000 1652.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1 32.000000 91.000000 579.058824 2.595964e+05 624.000000 2.000000 2.000000 24.000000 5.000000 1652.000000 ... 538.235294 450.588235 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
2 32.000000 91.000000 428.347032 2.765209e+05 624.000000 2.000000 2.000000 19.890411 2.716895 1652.000000 ... 292.465753 244.840183 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
3 32.000000 91.000000 495.324122 2.995162e+05 624.000000 2.000000 2.000000 21.512831 3.618239 1652.000000 ... 177.003553 148.180024 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
4 32.000000 91.000000 524.777037 2.896741e+05 624.000000 2.000000 2.000000 22.409722 4.116512 1652.000000 ... 148.153691 156.769680 26.684937 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
768 26.372926 54.595731 223.256613 3.242351e+06 2826.326740 4.329442 3.999994 78.992091 2.781301 1814.006580 ... 954.397095 1091.856904 1169.541079 867.333827 887.630045 1012.000418 893.780017 737.903565 700.853131 505.631516
769 21.761048 52.617012 191.379629 3.239546e+06 2637.828718 4.230610 3.999996 79.294463 2.546911 1820.904606 ... 2032.477966 1776.199833 911.078755 607.133679 621.341031 708.400292 625.646012 516.532496 490.597192 353.942061
770 18.532734 51.231908 169.365740 3.143682e+06 2505.880102 4.161427 3.999997 79.506124 2.382837 1825.733224 ... 1856.834577 1563.439883 955.455129 851.893575 839.038722 953.080205 726.852208 549.372747 379.118034 247.759443
771 20.772914 46.062336 184.556018 3.097577e+06 2202.916072 3.812999 3.999998 68.854287 2.267986 1702.213257 ... 3217.384204 2184.307918 1232.518590 1565.925503 1651.427105 2051.356143 1679.096546 836.660923 265.382624 173.431610
772 22.341040 42.443635 195.489213 3.194304e+06 2141.441250 3.869099 3.999999 61.398001 3.087590 1615.749280 ... 2252.168943 1529.015543 862.763013 1096.147852 1155.998974 1435.949300 1175.367582 585.662646 185.767837 121.402127

773 rows × 53 columns

standard scaling¶

In [194]:
from sklearn.preprocessing import StandardScaler
scale= StandardScaler()
for i in copy.columns:
    copy[i]=scale.fit_transform(np.array(copy[i]).reshape(-1,1)) 
copy
Out[194]:
Make Model Variant Ex-Showroom_Price Displacement Cylinders Valves_Per_Cylinder Fuel_Tank_Capacity Fuel_Type Height ... 2010 2009 2008 2007 2006 2005 2004 2003 2002 2001
0 1.420945 0.574255 1.647875 -0.319187 -1.254463 -1.662544 -3.874900 -0.569832 1.667709 0.503058 ... -0.608217 -0.575313 -0.587975 -0.582323 -0.583807 -0.602112 -0.605540 -0.573133 -0.561638 -0.582211
1 1.420945 0.574255 1.561225 -0.322632 -1.254463 -1.662544 -3.874900 -0.569832 1.667709 0.503058 ... -0.456617 -0.448079 -0.587975 -0.582323 -0.583807 -0.602112 -0.605540 -0.573133 -0.561638 -0.582211
2 1.420945 0.574255 0.552106 -0.320869 -1.254463 -1.662544 -3.874900 -0.642833 -0.837321 0.503058 ... -0.525841 -0.506177 -0.587975 -0.582323 -0.583807 -0.602112 -0.605540 -0.573133 -0.561638 -0.582211
3 1.420945 0.574255 1.000564 -0.318474 -1.254463 -1.662544 -3.874900 -0.614013 0.151637 0.503058 ... -0.558362 -0.533471 -0.587975 -0.582323 -0.583807 -0.602112 -0.605540 -0.573133 -0.561638 -0.582211
4 1.420945 0.574255 1.197771 -0.319499 -1.254463 -1.662544 -3.874900 -0.598081 0.698343 0.503058 ... -0.566488 -0.531045 -0.579884 -0.582323 -0.583807 -0.602112 -0.605540 -0.573133 -0.561638 -0.582211
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
768 0.636228 -0.512668 -0.821116 -0.011916 1.342880 0.125611 0.071935 0.407020 -0.766654 1.977005 ... -0.339401 -0.267003 -0.233337 -0.326361 -0.264434 -0.141051 -0.092530 -0.046686 0.136211 0.261836
769 -0.006916 -0.571747 -1.034554 -0.012208 1.120573 0.049743 0.071939 0.412392 -1.023828 2.039764 ... -0.035748 -0.073763 -0.311710 -0.403149 -0.360246 -0.279369 -0.246433 -0.204620 -0.073144 0.008622
770 -0.457117 -0.613102 -1.181952 -0.022194 0.964957 -0.003364 0.071941 0.416151 -1.203850 2.083695 ... -0.085220 -0.133841 -0.298254 -0.330917 -0.281918 -0.167895 -0.188343 -0.181190 -0.184145 -0.168628
771 -0.144716 -0.767450 -1.080243 -0.026997 0.607653 -0.270828 0.071943 0.226937 -1.329865 0.959902 ... 0.297994 0.041475 -0.214240 -0.120197 0.010383 0.332473 0.358224 0.023772 -0.297393 -0.292703
772 0.073966 -0.875494 -1.007037 -0.016921 0.535152 -0.227764 0.071945 0.094488 -0.430593 0.173246 ... 0.026131 -0.143561 -0.326361 -0.258835 -0.167874 0.052097 0.069095 -0.155300 -0.376667 -0.379556

773 rows × 53 columns

correlation matrix and heatmap¶

In [197]:
print(copy.corr())
fig, ax = plt.subplots(figsize=(100,100))
sns.heatmap(copy.corr(), cmap="YlGnBu", ax=ax)
                                    Make     Model   Variant  \
Make                            1.000000  0.208475  0.253510   
Model                           0.208475  1.000000  0.085415   
Variant                         0.253510  0.085415  1.000000   
Ex-Showroom_Price              -0.115716 -0.018142  0.009160   
Displacement                   -0.075206  0.026471 -0.074365   
Cylinders                      -0.145499 -0.054522 -0.116865   
Valves_Per_Cylinder            -0.218304 -0.032458  0.023025   
Fuel_Tank_Capacity             -0.078310  0.038498 -0.024173   
Fuel_Type                      -0.021992 -0.077561  0.045562   
Height                          0.079839  0.140346  0.042257   
Mileage                        -0.002513 -0.028645 -0.062856   
Length                         -0.021101  0.100128 -0.226275   
Width                          -0.077805  0.169530 -0.136537   
Body_Type                      -0.077230  0.098208 -0.181694   
Doors                           0.139971  0.004283  0.087467   
Kerb_Weight                    -0.133579 -0.199085 -0.247041   
Gears                          -0.013906  0.035437 -0.076801   
Power                          -0.095095  0.032931 -0.173772   
Speedometer                    -0.120382  0.033813 -0.099986   
Seating_Capacity                0.126563  0.125840  0.041827   
Seats_Material                 -0.076378  0.175931 -0.284230   
Type                            0.033284 -0.086956  0.011190   
Wheelbase                      -0.036165  0.084354 -0.222958   
Wheels_Size                    -0.101490  0.203160 -0.171305   
Audiosystem                     0.061301 -0.277678  0.284214   
Basic_Warranty                  0.057714  0.124746 -0.094087   
Bluetooth                      -0.010305 -0.085201  0.152288   
CD_/_MP3_/_DVD_Player          -0.035144 -0.206113  0.085889   
Voice_Recognition               0.199049  0.045625  0.079042   
Walk_Away_Auto_Car_Lock         0.264290  0.023262  0.093925   
ABS_(Anti-lock_Braking_System) -0.033435 -0.034147  0.267067   
Airbags                        -0.031306  0.036009  0.170229   
Parking_Assistance             -0.105118 -0.009136 -0.145919   
2020                            0.015468  0.022929  0.135239   
2019                            0.005010 -0.025064  0.124310   
2018                           -0.008100 -0.039486  0.093668   
2017                           -0.022235 -0.041271  0.075335   
2016                           -0.022815 -0.060117  0.040416   
2015                           -0.010044 -0.066281  0.027096   
2014                           -0.008843 -0.060062  0.017133   
2013                           -0.011172 -0.064457  0.012024   
2012                           -0.009170 -0.057592  0.005299   
2011                           -0.017278 -0.054700 -0.031533   
2010                           -0.010799 -0.031360 -0.055540   
2009                           -0.004657 -0.036278 -0.024645   
2008                            0.007526 -0.060613 -0.058288   
2007                            0.002874 -0.053609 -0.059169   
2006                           -0.028341 -0.040278 -0.045639   
2005                           -0.022707 -0.047797 -0.040838   
2004                           -0.023089 -0.053334 -0.033640   
2003                            0.005454 -0.064037 -0.025720   
2002                            0.033423 -0.056388 -0.044466   
2001                            0.080530 -0.058918 -0.051113   

                                Ex-Showroom_Price  Displacement  Cylinders  \
Make                                    -0.115716     -0.075206  -0.145499   
Model                                   -0.018142      0.026471  -0.054522   
Variant                                  0.009160     -0.074365  -0.116865   
Ex-Showroom_Price                        1.000000      0.795188   0.838380   
Displacement                             0.795188      1.000000   0.902160   
Cylinders                                0.838380      0.902160   1.000000   
Valves_Per_Cylinder                      0.036746      0.141948   0.097594   
Fuel_Tank_Capacity                       0.338944      0.565761   0.490547   
Fuel_Type                                0.210892      0.004770   0.073650   
Height                                  -0.147219      0.133143  -0.072956   
Mileage                                  0.003464      0.076727   0.036918   
Length                                   0.384321      0.766015   0.623395   
Width                                    0.389666      0.730940   0.596703   
Body_Type                                0.007320      0.179813   0.162047   
Doors                                   -0.513613     -0.543241  -0.566929   
Kerb_Weight                             -0.043837     -0.063152  -0.030376   
Gears                                    0.290103      0.440566   0.452555   
Power                                    0.513142      0.820511   0.729666   
Speedometer                              0.100452      0.032931   0.033468   
Seating_Capacity                        -0.311992     -0.050832  -0.262279   
Seats_Material                           0.384793      0.598961   0.528216   
Type                                    -0.389581     -0.567440  -0.469751   
Wheelbase                                0.446363      0.786926   0.651565   
Wheels_Size                              0.407756      0.617190   0.520861   
Audiosystem                              0.067797     -0.017464   0.031136   
Basic_Warranty                           0.194254      0.219610   0.298713   
Bluetooth                               -0.143551     -0.323512  -0.323833   
CD_/_MP3_/_DVD_Player                   -0.136109     -0.325334  -0.320540   
Voice_Recognition                       -0.219807     -0.190832  -0.188798   
Walk_Away_Auto_Car_Lock                 -0.217931     -0.159362  -0.165533   
ABS_(Anti-lock_Braking_System)          -0.031729     -0.184391  -0.195045   
Airbags                                  0.108251     -0.022966   0.024250   
Parking_Assistance                       0.145808      0.346522   0.306858   
2020                                     0.116692      0.151386   0.108154   
2019                                     0.094731      0.137781   0.103406   
2018                                     0.099199      0.138692   0.108914   
2017                                     0.095631      0.121817   0.097039   
2016                                     0.072035      0.108117   0.080966   
2015                                     0.032750      0.074535   0.042942   
2014                                     0.011265      0.065275   0.030579   
2013                                     0.015755      0.069295   0.033503   
2012                                     0.019075      0.076272   0.029046   
2011                                     0.003830      0.058935   0.016894   
2010                                     0.001761      0.065749   0.017867   
2009                                    -0.013582      0.056858   0.002207   
2008                                    -0.024890      0.038771   0.002837   
2007                                    -0.006691      0.039776   0.012457   
2006                                     0.004146      0.044899   0.021810   
2005                                     0.022745      0.058172   0.032112   
2004                                     0.017314      0.048318   0.027538   
2003                                     0.020139      0.043726   0.020634   
2002                                     0.027058      0.046905   0.023154   
2001                                     0.043522      0.068427   0.031733   

                                Valves_Per_Cylinder  Fuel_Tank_Capacity  \
Make                                      -0.218304           -0.078310   
Model                                     -0.032458            0.038498   
Variant                                    0.023025           -0.024173   
Ex-Showroom_Price                          0.036746            0.338944   
Displacement                               0.141948            0.565761   
Cylinders                                  0.097594            0.490547   
Valves_Per_Cylinder                        1.000000            0.062792   
Fuel_Tank_Capacity                         0.062792            1.000000   
Fuel_Type                                  0.071125            0.032150   
Height                                    -0.071956            0.081234   
Mileage                                    0.002455            0.281389   
Length                                     0.228045            0.414888   
Width                                      0.182482            0.430216   
Body_Type                                  0.060577            0.083175   
Doors                                     -0.110578           -0.271047   
Kerb_Weight                               -0.009843           -0.039913   
Gears                                      0.193782            0.350491   
Power                                      0.259934            0.459501   
Speedometer                               -0.024578           -0.038011   
Seating_Capacity                           0.005090           -0.092388   
Seats_Material                             0.194238            0.400613   
Type                                      -0.248846           -0.343209   
Wheelbase                                  0.221225            0.413505   
Wheels_Size                                0.006568            0.372093   
Audiosystem                                0.025098           -0.018309   
Basic_Warranty                             0.092205            0.082739   
Bluetooth                                 -0.181965           -0.209915   
CD_/_MP3_/_DVD_Player                     -0.124616           -0.209993   
Voice_Recognition                         -0.113612           -0.018233   
Walk_Away_Auto_Car_Lock                   -0.122964           -0.016126   
ABS_(Anti-lock_Braking_System)            -0.243578           -0.118617   
Airbags                                   -0.178514            0.014427   
Parking_Assistance                         0.266482            0.206278   
2020                                       0.025518            0.002023   
2019                                       0.052571            0.001751   
2018                                       0.033120            0.001765   
2017                                       0.020534           -0.000950   
2016                                       0.004079           -0.013114   
2015                                       0.007216           -0.041144   
2014                                       0.008595           -0.047113   
2013                                       0.008817           -0.043714   
2012                                       0.015946           -0.037033   
2011                                       0.013049           -0.045157   
2010                                      -0.008751           -0.042896   
2009                                      -0.017025           -0.042815   
2008                                      -0.049046           -0.049399   
2007                                      -0.061956           -0.046491   
2006                                      -0.047748           -0.040973   
2005                                      -0.029689           -0.037917   
2004                                      -0.046000           -0.043629   
2003                                      -0.047788           -0.045429   
2002                                      -0.056333           -0.043184   
2001                                      -0.062636           -0.039925   

                                Fuel_Type    Height  ...      2010      2009  \
Make                            -0.021992  0.079839  ... -0.010799 -0.004657   
Model                           -0.077561  0.140346  ... -0.031360 -0.036278   
Variant                          0.045562  0.042257  ... -0.055540 -0.024645   
Ex-Showroom_Price                0.210892 -0.147219  ...  0.001761 -0.013582   
Displacement                     0.004770  0.133143  ...  0.065749  0.056858   
Cylinders                        0.073650 -0.072956  ...  0.017867  0.002207   
Valves_Per_Cylinder              0.071125 -0.071956  ... -0.008751 -0.017025   
Fuel_Tank_Capacity               0.032150  0.081234  ... -0.042896 -0.042815   
Fuel_Type                        1.000000 -0.445415  ... -0.087866 -0.101351   
Height                          -0.445415  1.000000  ...  0.066520  0.070962   
Mileage                          0.093195 -0.050732  ... -0.050324 -0.047328   
Length                          -0.262612  0.227007  ...  0.150035  0.147336   
Width                           -0.266009  0.254775  ...  0.076259  0.075177   
Body_Type                       -0.345959  0.136707  ...  0.080385  0.061599   
Doors                           -0.112946  0.333427  ...  0.074348  0.086086   
Kerb_Weight                     -0.041625 -0.091660  ... -0.082105 -0.077994   
Gears                            0.010405 -0.040043  ...  0.074695  0.068605   
Power                           -0.107643  0.111514  ...  0.103753  0.096023   
Speedometer                     -0.059259  0.196334  ... -0.017591 -0.042168   
Seating_Capacity                -0.465110  0.785823  ...  0.094159  0.104625   
Seats_Material                  -0.060493  0.004517  ...  0.000520 -0.004417   
Type                            -0.096328  0.148243  ... -0.058357 -0.063302   
Wheelbase                       -0.183828  0.193973  ...  0.121981  0.122759   
Wheels_Size                     -0.163078  0.320688  ...  0.079969  0.079446   
Audiosystem                      0.049507 -0.048633  ...  0.016289  0.020194   
Basic_Warranty                   0.060417 -0.051222  ...  0.115314  0.083417   
Bluetooth                        0.006477  0.124584  ... -0.069171 -0.078663   
CD_/_MP3_/_DVD_Player            0.099779  0.104813  ... -0.114508 -0.122052   
Voice_Recognition               -0.036740  0.108912  ... -0.001062  0.000229   
Walk_Away_Auto_Car_Lock         -0.094539  0.075527  ...  0.032562  0.037988   
ABS_(Anti-lock_Braking_System)  -0.027755  0.187992  ... -0.122439 -0.122888   
Airbags                          0.017158  0.101550  ... -0.130311 -0.131385   
Parking_Assistance              -0.033688 -0.058274  ...  0.083728  0.087305   
2020                            -0.113364  0.082212  ...  0.717282  0.704241   
2019                            -0.117445  0.072288  ...  0.773154  0.759953   
2018                            -0.109487  0.068365  ...  0.818546  0.803232   
2017                            -0.097590  0.057430  ...  0.854174  0.836179   
2016                            -0.084362  0.061403  ...  0.895144  0.878750   
2015                            -0.082972  0.067882  ...  0.923819  0.910711   
2014                            -0.084154  0.070001  ...  0.935694  0.923896   
2013                            -0.082491  0.073799  ...  0.948150  0.937108   
2012                            -0.075320  0.070946  ...  0.968559  0.957965   
2011                            -0.065654  0.054414  ...  0.983298  0.964497   
2010                            -0.087866  0.066520  ...  1.000000  0.982726   
2009                            -0.101351  0.070962  ...  0.982726  1.000000   
2008                            -0.101182  0.076766  ...  0.960176  0.968303   
2007                            -0.104886  0.060226  ...  0.937428  0.946010   
2006                            -0.098825  0.043237  ...  0.912960  0.929158   
2005                            -0.104021  0.054503  ...  0.889007  0.903935   
2004                            -0.122677  0.055633  ...  0.862551  0.884884   
2003                            -0.113994  0.053367  ...  0.844577  0.865202   
2002                            -0.101578  0.055531  ...  0.812641  0.827817   
2001                            -0.102704  0.092178  ...  0.759007  0.768025   

                                    2008      2007      2006      2005  \
Make                            0.007526  0.002874 -0.028341 -0.022707   
Model                          -0.060613 -0.053609 -0.040278 -0.047797   
Variant                        -0.058288 -0.059169 -0.045639 -0.040838   
Ex-Showroom_Price              -0.024890 -0.006691  0.004146  0.022745   
Displacement                    0.038771  0.039776  0.044899  0.058172   
Cylinders                       0.002837  0.012457  0.021810  0.032112   
Valves_Per_Cylinder            -0.049046 -0.061956 -0.047748 -0.029689   
Fuel_Tank_Capacity             -0.049399 -0.046491 -0.040973 -0.037917   
Fuel_Type                      -0.101182 -0.104886 -0.098825 -0.104021   
Height                          0.076766  0.060226  0.043237  0.054503   
Mileage                        -0.048655 -0.051815 -0.051611 -0.054745   
Length                          0.128840  0.118789  0.119867  0.132529   
Width                           0.050964  0.037554  0.048032  0.061676   
Body_Type                       0.068644  0.070442  0.072382  0.089699   
Doors                           0.109051  0.093499  0.081519  0.067409   
Kerb_Weight                    -0.082351 -0.087290 -0.087963 -0.090490   
Gears                           0.056241  0.051726  0.052661  0.043054   
Power                           0.081340  0.079487  0.089642  0.097329   
Speedometer                    -0.037170 -0.038611 -0.037986 -0.023125   
Seating_Capacity                0.111113  0.097771  0.076567  0.087438   
Seats_Material                 -0.011451 -0.007482  0.003252  0.011753   
Type                           -0.029787 -0.034858 -0.042300 -0.053888   
Wheelbase                       0.095477  0.086495  0.088693  0.099371   
Wheels_Size                     0.068786  0.065957  0.075106  0.075682   
Audiosystem                     0.016307  0.025672  0.032336  0.040048   
Basic_Warranty                  0.095063  0.080919  0.074584  0.076788   
Bluetooth                      -0.069034 -0.065213 -0.077171 -0.082368   
CD_/_MP3_/_DVD_Player          -0.119326 -0.122458 -0.137274 -0.136490   
Voice_Recognition               0.009189  0.017027  0.008508 -0.002410   
Walk_Away_Auto_Car_Lock         0.046752  0.047410  0.038037  0.014990   
ABS_(Anti-lock_Braking_System) -0.119655 -0.100853 -0.104947 -0.106098   
Airbags                        -0.144713 -0.130748 -0.131563 -0.133040   
Parking_Assistance              0.082925  0.086838  0.106175  0.122062   
2020                            0.683737  0.671682  0.672234  0.679483   
2019                            0.726329  0.711250  0.709864  0.712452   
2018                            0.769192  0.754423  0.748913  0.747245   
2017                            0.808206  0.792126  0.787450  0.784457   
2016                            0.851004  0.826012  0.813117  0.803531   
2015                            0.875979  0.846551  0.833550  0.819972   
2014                            0.891944  0.863522  0.853644  0.834372   
2013                            0.909792  0.880845  0.864802  0.847890   
2012                            0.929203  0.900370  0.881110  0.865411   
2011                            0.936751  0.910986  0.892245  0.874111   
2010                            0.960176  0.937428  0.912960  0.889007   
2009                            0.968303  0.946010  0.929158  0.903935   
2008                            1.000000  0.986006  0.958722  0.932426   
2007                            0.986006  1.000000  0.979841  0.953585   
2006                            0.958722  0.979841  1.000000  0.983086   
2005                            0.932426  0.953585  0.983086  1.000000   
2004                            0.906593  0.932364  0.965650  0.986727   
2003                            0.891694  0.918514  0.947832  0.970084   
2002                            0.867365  0.898841  0.922937  0.948436   
2001                            0.820410  0.851258  0.867647  0.895052   

                                    2004      2003      2002      2001  
Make                           -0.023089  0.005454  0.033423  0.080530  
Model                          -0.053334 -0.064037 -0.056388 -0.058918  
Variant                        -0.033640 -0.025720 -0.044466 -0.051113  
Ex-Showroom_Price               0.017314  0.020139  0.027058  0.043522  
Displacement                    0.048318  0.043726  0.046905  0.068427  
Cylinders                       0.027538  0.020634  0.023154  0.031733  
Valves_Per_Cylinder            -0.046000 -0.047788 -0.056333 -0.062636  
Fuel_Tank_Capacity             -0.043629 -0.045429 -0.043184 -0.039925  
Fuel_Type                      -0.122677 -0.113994 -0.101578 -0.102704  
Height                          0.055633  0.053367  0.055531  0.092178  
Mileage                        -0.055985 -0.054225 -0.054860 -0.058926  
Length                          0.131912  0.129719  0.139481  0.170235  
Width                           0.060364  0.055977  0.054802  0.072160  
Body_Type                       0.098220  0.108213  0.130932  0.137814  
Doors                           0.059932  0.056989  0.041730  0.047782  
Kerb_Weight                    -0.091942 -0.087428 -0.086561 -0.089753  
Gears                           0.040194  0.039300  0.037772  0.042717  
Power                           0.091737  0.090601  0.096138  0.117713  
Speedometer                    -0.023490 -0.028578 -0.036334 -0.037495  
Seating_Capacity                0.092796  0.096228  0.103968  0.148299  
Seats_Material                  0.014134  0.008102  0.010123  0.020130  
Type                           -0.050458 -0.046679 -0.039764 -0.023933  
Wheelbase                       0.103084  0.097378  0.104264  0.132096  
Wheels_Size                     0.068334  0.059143  0.063509  0.082791  
Audiosystem                     0.056643  0.058010  0.043599  0.035159  
Basic_Warranty                  0.070389  0.072765  0.075454  0.094425  
Bluetooth                      -0.085805 -0.096520 -0.085405 -0.056615  
CD_/_MP3_/_DVD_Player          -0.139706 -0.147393 -0.146553 -0.129998  
Voice_Recognition              -0.014284 -0.010566  0.010103  0.019162  
Walk_Away_Auto_Car_Lock         0.007052  0.020102  0.032556  0.039537  
ABS_(Anti-lock_Braking_System) -0.089296 -0.094677 -0.093962 -0.082036  
Airbags                        -0.121141 -0.124823 -0.123440 -0.124149  
Parking_Assistance              0.130746  0.132320  0.125227  0.117256  
2020                            0.665006  0.665127  0.645353  0.607763  
2019                            0.696147  0.688050  0.658355  0.610747  
2018                            0.731552  0.725606  0.694393  0.644986  
2017                            0.763901  0.758469  0.729586  0.681069  
2016                            0.782038  0.771788  0.737788  0.689043  
2015                            0.801705  0.789842  0.753937  0.703589  
2014                            0.815235  0.801889  0.765768  0.713647  
2013                            0.824896  0.810344  0.775736  0.724514  
2012                            0.840915  0.827944  0.795463  0.743372  
2011                            0.849997  0.836756  0.805357  0.750136  
2010                            0.862551  0.844577  0.812641  0.759007  
2009                            0.884884  0.865202  0.827817  0.768025  
2008                            0.906593  0.891694  0.867365  0.820410  
2007                            0.932364  0.918514  0.898841  0.851258  
2006                            0.965650  0.947832  0.922937  0.867647  
2005                            0.986727  0.970084  0.948436  0.895052  
2004                            1.000000  0.985468  0.956465  0.899610  
2003                            0.985468  1.000000  0.984176  0.938277  
2002                            0.956465  0.984176  1.000000  0.973790  
2001                            0.899610  0.938277  0.973790  1.000000  

[53 rows x 53 columns]
Out[197]:
<AxesSubplot:>

box plot¶

In [202]:
plt.scatter(df.Height,df.Displacement)
Out[202]:
<matplotlib.collections.PathCollection at 0x259811a1730>
In [198]:
 sns.boxplot(x=df.Height)
Out[198]:
<AxesSubplot:xlabel='Height'>

convert the final dataframe into csv file¶

In [200]:
df.to_csv("cars.csv")

make more sample datasets¶

In [199]:
specifications=pd.read_csv("./datasets/specifications.csv").sample(frac = 1)
annual_sales=pd.read_csv("./datasets/annual_sales.csv").sample(frac = 1)
df_new=pd.concat([specifications,annual_sales], axis=1, join="inner")

now repeat all the cleaning steps on this new dataframe